Source Code of org.terrier.structures.merging.StructureMerger

/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is StructureMerger.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk> (original author) 
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
 */
package org.terrier.structures.merging;


import gnu.trove.TIntIntHashMap;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;


import org.apache.log4j.Logger;


import org.terrier.compression.BitIn;
import org.terrier.structures.BasicDocumentIndexEntry;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.DirectIndex;
import org.terrier.structures.DirectIndexInputStream;
import org.terrier.structures.DirectInvertedOutputStream;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldDirectInvertedOutputStream;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.InvertedIndex;
import org.terrier.structures.InvertedIndexInputStream;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.MetaIndex;
import org.terrier.structures.PostingIndex;
import org.terrier.structures.PostingIndexInputStream;
import org.terrier.structures.SimpleBitIndexPointer;
import org.terrier.structures.SimpleDocumentIndexEntry;
import org.terrier.structures.indexing.CompressingMetaIndexBuilder;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.LexiconBuilder;
import org.terrier.structures.indexing.MetaIndexBuilder;
import org.terrier.structures.postings.BasicIterablePosting;
import org.terrier.structures.postings.FieldIterablePosting;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.structures.postings.Posting;
import org.terrier.structures.postings.PostingIdComparator;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;


/**
 * This class merges the structures created by Terrier, so that
 * we use fewer and larger inverted and direct files.
 * <p>
 * <b>Properties:</b>
 * <ul>
 * <li><tt>lexicon.use.hash</tt> - build a lexicon hash file for new index. Set to <tt>true</tt> by default.</li>
 * <li><tt>merge.direct</tt> - merge the direct indices if both indices have them. Set to <tt>true</tt> by default.</li>
 * <li><tt>merger.meta.reverse</tt> - carry over the reverse meta index keys of the first source index. Set to <tt>true</tt> by default.</li>
 * </ul>
 * @author Vassilis Plachouras and Craig Macdonald
 */
public class StructureMerger {
  
  /** the logger used */
  protected static final Logger logger = Logger.getLogger(StructureMerger.class);  
  /** 
   * Maps term ids of the second source index to the term ids used in the
   * merged structures. It is filled by mergeInvertedFiles() for every term
   * of the second lexicon (shared terms map to the first index's term id,
   * new terms to a freshly assigned id) and read by mergeDirectFiles().
   * Only allocated when keepTermCodeMap is true.
   */
  protected TIntIntHashMap termcodeHashmap = null;
  /** 
   * Whether mergeInvertedFiles() should allocate and fill termcodeHashmap.
   * mergeStructures() sets this to true when both source indices have a 
   * direct structure.
   */
  protected boolean keepTermCodeMap = false;
  
  /** The number of documents in the merged structures. */
  protected int numberOfDocuments;
  
  /** The number of pointers in the merged structures. */
  protected long numberOfPointers;
  
  /** The number of terms in the collection. */
  protected int numberOfTerms;


  
  /** 
   * Value of the property <tt>merger.meta.reverse</tt> (default true). When false,
   * the merged meta index is built with an empty set of reverse keys.
   */
  protected boolean MetaReverse = Boolean.parseBoolean(ApplicationSetup.getProperty("merger.meta.reverse", "true"));
  
  /** source index 1 */  
  protected Index srcIndex1; 
  /** source index 2 */
  protected Index srcIndex2; 
  /** destination index */
  protected Index destIndex;


  /** class to use to write direct file when no fields are in use */  
  protected Class<? extends DirectInvertedOutputStream> directFileOutputStreamClass = DirectInvertedOutputStream.class;
  /** class to use to write direct file when fields are in use */
  protected Class<? extends DirectInvertedOutputStream> fieldDirectFileOutputStreamClass = FieldDirectInvertedOutputStream.class;
  
  /** class to use to write inverted file when no fields are in use */
  protected Class<? extends DirectInvertedOutputStream> invertedFileOutputStreamClass = DirectInvertedOutputStream.class;
  /** class to use to write inverted file when fields are in use */
  protected Class<? extends DirectInvertedOutputStream> fieldInvertedFileOutputStreamClass = FieldDirectInvertedOutputStream.class;
  
  /** class to use to read the direct file */
  protected String directFileInputClass = DirectIndex.class.getName();
  /** class to use to read the direct file as a stream */
  protected String directFileInputStreamClass = DirectIndexInputStream.class.getName();
  /** class to use to read the inverted file */
  protected String invertedFileInputClass = InvertedIndex.class.getName();
  /** class to use to read the inverted file as a stream */
  protected String invertedFileInputStreamClass = InvertedIndexInputStream.class.getName();
  
  /** name of the posting iterator class registered for the merged inverted index when no fields are in use */
  protected String basicInvertedIndexPostingIteratorClass = BasicIterablePosting.class.getName();
  /** name of the posting iterator class registered for the merged inverted index when fields are in use */
  protected String fieldInvertedIndexPostingIteratorClass = FieldIterablePosting.class.getName();
  /** name of the posting iterator class registered for the merged direct index when no fields are in use */
  protected String basicDirectIndexPostingIteratorClass = BasicIterablePosting.class.getName();
  /** name of the posting iterator class registered for the merged direct index when fields are in use */
  protected String fieldDirectIndexPostingIteratorClass = FieldIterablePosting.class.getName();
  /**
   * constructor
   * @param _srcIndex1
   * @param _srcIndex2
   * @param _destIndex
   */
  public StructureMerger(Index _srcIndex1, Index _srcIndex2, Index _destIndex)
  {
    this.srcIndex1 = _srcIndex1;
    this.srcIndex2 = _srcIndex2;
    this.destIndex = _destIndex;
    numberOfDocuments = 0;
    numberOfPointers = 0;
    numberOfTerms = 0;
  }
  




  
  /**
   * Sets the output index. This index should have no documents
   * @param _outputIndex the index to be merged to
   */
  public void setOutputIndex(Index _outputIndex) {
    this.destIndex = _outputIndex;
    //invertedFileOutput = _outputName;
  }
  


  /**
   * Merges the two lexicons into one. After this stage, the offsets in the
   * lexicon are ot correct. They will be updated only after creating the 
   * inverted file.
   */
  @SuppressWarnings("unchecked")
  protected void mergeInvertedFiles() {
    try {
      //getting the number of entries in the first document index, 
      //in order to assign the correct docids to the documents 
      //of the second inverted file.
      
      int numberOfDocs1 = srcIndex1.getCollectionStatistics().getNumberOfDocuments();
      int numberOfDocs2 = srcIndex2.getCollectionStatistics().getNumberOfDocuments();
            
      numberOfDocuments = numberOfDocs1 + numberOfDocs2;
      
      
      final int srcFieldCount1 = srcIndex1.getIntIndexProperty("index.inverted.fields.count", 0);
      final int srcFieldCount2 = srcIndex1.getIntIndexProperty("index.inverted.fields.count", 0);
      if (srcFieldCount1 != srcFieldCount2)
      {
        throw new Error("FieldCounts in source indices must match");
      }
      
      final int fieldCount = srcFieldCount1;
      
      //creating a new map between new and old term codes
      if (keepTermCodeMap)
        termcodeHashmap = new TIntIntHashMap();


      //setting the input streams
      Iterator<Map.Entry<String,LexiconEntry>> lexInStream1 = 
        (Iterator<Map.Entry<String,LexiconEntry>>)srcIndex1.getIndexStructureInputStream("lexicon");
      Iterator<Map.Entry<String,LexiconEntry>> lexInStream2 = 
        (Iterator<Map.Entry<String,LexiconEntry>>)srcIndex2.getIndexStructureInputStream("lexicon");
      
      for(String property : new String[] {"index.inverted.fields.names", "max.term.length", "index.lexicon-keyfactory.class", "index.lexicon-keyfactory.parameter_values",
          "index.lexicon-keyfactory.parameter_types", "index.lexicon-valuefactory.class", "index.lexicon-valuefactory.parameter_values",
          "index.lexicon-valuefactory.parameter_types"} )
      {
        destIndex.setIndexProperty(property, srcIndex1.getIndexProperty(property, null));
      }
      
      FixedSizeWriteableFactory<LexiconEntry> lvf = 
        (FixedSizeWriteableFactory<LexiconEntry>)srcIndex1.getIndexStructure("lexicon-valuefactory");
        
      //setting the output stream
      LexiconOutputStream<String> lexOutStream = 
        new FSOMapFileLexiconOutputStream(destIndex, "lexicon", (Class <FixedSizeWriteableFactory<LexiconEntry>>) lvf.getClass());


      int newCodes = (int)srcIndex1.getCollectionStatistics().getNumberOfUniqueTerms(); 
      
      PostingIndex inverted1 = srcIndex1.getInvertedIndex();
      PostingIndex inverted2 = srcIndex2.getInvertedIndex();
      
      DirectInvertedOutputStream invOS =null;
      try{
        invOS = (fieldCount > 0 ? fieldInvertedFileOutputStreamClass : invertedFileOutputStreamClass)
          .getConstructor(String.class)
          .newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR +  
            destIndex.getPrefix() + ".inverted"+ BitIn.USUAL_EXTENSION);
        
      } catch (Exception e) {
        logger.error("Couldn't create specified DirectInvertedOutputStream", e);
        return;
      }




      boolean hasMore1 = false;
      boolean hasMore2 = false;
      String term1;
      String term2;
      Map.Entry<String,LexiconEntry> lee1 = null;
      Map.Entry<String,LexiconEntry> lee2 = null;
      hasMore1 = lexInStream1.hasNext();
      if (hasMore1)
        lee1 = lexInStream1.next();
      hasMore2 = lexInStream2.hasNext();
      if (hasMore2)
        lee2 = lexInStream2.next();
      while (hasMore1 && hasMore2) {
    
        term1 = lee1.getKey();
        term2 = lee2.getKey();
        
        int lexicographicalCompare = term1.compareTo(term2);
        if (lexicographicalCompare < 0) {
          //write to inverted file postings for the term that only occurs in 1st index
          BitIndexPointer newPointer = invOS.writePostings(inverted1.getPostings(lee1.getValue()));
          lee1.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          lexOutStream.writeNextEntry(term1, lee1.getValue());
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
        
        } else if (lexicographicalCompare > 0) {
          //write to inverted file postings for the term that only occurs in 2nd index
          //docids are transformed as we go.
          BitIndexPointer newPointer = 
            invOS.writePostings(inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
          lee2.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          
          int newCode = newCodes++;
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
          lee2.getValue().setTermId(newCode);
          lexOutStream.writeNextEntry(term2, lee2.getValue());
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        } else {
          //write to postings for a term that occurs in both indices
          
          //1. postings from the first index are unchanged
          IterablePosting ip1 = inverted1.getPostings(lee1.getValue());
          BitIndexPointer newPointer1 = invOS.writePostings(ip1);
          
          //2. postings from the 2nd index have their docids transformed
          IterablePosting ip2 = inverted2.getPostings(lee2.getValue());
          BitIndexPointer newPointer2 = invOS.writePostings(ip2, ip1.getId() - numberOfDocs1);
          
          numberOfPointers+= newPointer1.getNumberOfEntries() + newPointer2.getNumberOfEntries();
            
          //don't set numberOfEntries, as LexiconEntry.add() will take care of this.
          lee1.getValue().setPointer(newPointer1);
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), lee1.getValue().getTermId());
          
          lee1.getValue().add(lee2.getValue());
          lexOutStream.writeNextEntry(term1, lee1.getValue());
          
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
          
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        }
      }
      
      if (hasMore1) {
        lee2 = null;
        while (hasMore1) {
          //write to inverted file as well.
          BitIndexPointer newPointer = invOS.writePostings(
              inverted1.getPostings(lee1.getValue()));
          lee1.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
        }
      } else if (hasMore2) {
        lee1 = null;
        while (hasMore2) {
          //write to inverted file as well.
          BitIndexPointer newPointer = invOS.writePostings(
              inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
          lee2.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          int newCode = newCodes++;
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
          lee2.getValue().setTermId(newCode);
          lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        }    
      }
      IndexUtil.close(lexInStream1);
      IndexUtil.close(lexInStream2);
      


      inverted1.close();
      inverted2.close();
      invOS.close();
      
      destIndex.setIndexProperty("num.Documents", ""+numberOfDocuments);
      destIndex.addIndexStructure(
            "inverted",
            invertedFileInputClass,
            "org.terrier.structures.Index,java.lang.String,org.terrier.structures.DocumentIndex,java.lang.Class", 
            "index,structureName,document,"+ 
              (fieldCount > 0
                ? fieldInvertedIndexPostingIteratorClass
                : basicInvertedIndexPostingIteratorClass ));
          destIndex.addIndexStructureInputStream(
                      "inverted",
                      invertedFileInputStreamClass,
                      "org.terrier.structures.Index,java.lang.String,java.util.Iterator,java.lang.Class",
                      "index,structureName,lexicon-entry-inputstream,"+
                        (fieldCount > 0
                          ? fieldInvertedIndexPostingIteratorClass
                : basicInvertedIndexPostingIteratorClass ));
          destIndex.setIndexProperty("index.inverted.fields.count", ""+fieldCount);
      lexOutStream.close();
      if (fieldCount > 0)
      {
        destIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
      }
      destIndex.flush();
                
    } catch(IOException ioe) {
      logger.error("IOException while merging lexicons and inverted files.", ioe);
    }
  }




  /**
   * Merges the two direct files and the corresponding document id files.
   */
  @SuppressWarnings("unchecked")
  protected void mergeDirectFiles() {
    try {
      final DocumentIndexBuilder docidOutput = new DocumentIndexBuilder(destIndex, "document");
      
      final String[] metaTags = ArrayUtils.parseCommaDelimitedString(srcIndex1.getIndexProperty("index.meta.key-names", "docno"));
      final int[] metaTagLengths = ArrayUtils.parseCommaDelimitedInts(srcIndex1.getIndexProperty("index.meta.value-lengths", "20"));
      final String[] metaReverseTags = MetaReverse
        ? ArrayUtils.parseCommaDelimitedString(srcIndex1.getIndexProperty("index.meta.reverse-key-names", "docno"))
        : new String[0];
      final MetaIndexBuilder metaBuilder = new CompressingMetaIndexBuilder(destIndex, metaTags, metaTagLengths, metaReverseTags);
    
      if (! srcIndex1.getIndexProperty("index.meta.key-names", "docno").equals(srcIndex2.getIndexProperty("index.meta.key-names", "docno")))
      {
        throw new Error("Meta fields in source indices must match");
      }
      final BitIndexPointer emptyPointer = new SimpleBitIndexPointer();
      
        
      final int srcFieldCount1 = srcIndex1.getIntIndexProperty("index.direct.fields.count", 0);
      final int srcFieldCount2 = srcIndex1.getIntIndexProperty("index.direct.fields.count", 0);
      if (srcFieldCount1 != srcFieldCount2)
      {
        throw new Error("FieldCounts in source indices must match");
      }
      
      final int fieldCount = srcFieldCount1;
      
      
      for(String property : new String[] {"index.direct.fields.names","index.direct.fields.count" } )
      {
        destIndex.setIndexProperty(property, srcIndex1.getIndexProperty(property, null));
      }
      
      DirectInvertedOutputStream dfOutput = null;
      try{
        dfOutput = 
          (fieldCount > 0 ? fieldDirectFileOutputStreamClass : directFileOutputStreamClass)
          .getConstructor(String.class)
          .newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR +  
                destIndex.getPrefix() + ".direct" + BitIn.USUAL_EXTENSION);
      } catch (Exception e) {
        logger.error("Couldn't create specified DirectInvertedOutputStream", e);
        return;
      }
      
      
      final Iterator<DocumentIndexEntry> docidInput1 = (Iterator<DocumentIndexEntry>)srcIndex1.getIndexStructureInputStream("document");
      final PostingIndexInputStream dfInput1 = (PostingIndexInputStream)srcIndex1.getIndexStructureInputStream("direct");
      final MetaIndex metaInput1 = srcIndex1.getMetaIndex();
      
      int sourceDocid = 0;
      //traversing the direct index, without any change
      while(docidInput1.hasNext())
      {
        BitIndexPointer pointerDF = emptyPointer;
        DocumentIndexEntry die = docidInput1.next();
        if (die.getDocumentLength() > 0)
        {
          pointerDF = dfOutput.writePostings(dfInput1.next());
        }
        die.setBitIndexPointer(pointerDF);
        docidOutput.addEntryToBuffer(die);
        metaBuilder.writeDocumentEntry(metaInput1.getAllItems(sourceDocid));
        sourceDocid++;
      }
      dfInput1.close();
      metaInput1.close();
      IndexUtil.close(docidInput1);
      final Iterator<DocumentIndexEntry> docidInput2 = (Iterator<DocumentIndexEntry>)srcIndex2.getIndexStructureInputStream("document");
      final PostingIndexInputStream dfInput2 = (PostingIndexInputStream)srcIndex2.getIndexStructureInputStream("direct");
      final MetaIndex metaInput2 = srcIndex2.getMetaIndex();
      
      sourceDocid = 0;
      while (docidInput2.hasNext())
      {
        DocumentIndexEntry die = docidInput2.next();
      
        BitIndexPointer pointerDF = emptyPointer;
        if (die.getDocumentLength() > 0)
        {
          final IterablePosting postings = dfInput2.next();
          
          List<Posting> postingList = new ArrayList<Posting>();
          while(postings.next() != IterablePosting.EOL)
          {
            final Posting p = postings.asWritablePosting();
            p.setId(termcodeHashmap.get(postings.getId()));
            postingList.add(p);
          }
          Collections.sort(postingList, new PostingIdComparator());
          pointerDF = dfOutput.writePostings(postingList.iterator());
        }
        die.setBitIndexPointer(pointerDF);
        docidOutput.addEntryToBuffer(die);
        metaBuilder.writeDocumentEntry(metaInput2.getAllItems(sourceDocid));
        sourceDocid++;
      }
      dfInput2.close();
      IndexUtil.close(docidInput2);
      metaInput2.close();
      
      metaBuilder.close();
      dfOutput.close();
      docidOutput.finishedCollections();
      docidOutput.close();


      destIndex.addIndexStructure(
          "direct", 
          "org.terrier.structures.DirectIndex", 
          "org.terrier.structures.Index,java.lang.String,java.lang.Class", 
          "index,structureName,"+ 
            (fieldCount > 0 ? fieldDirectIndexPostingIteratorClass : basicDirectIndexPostingIteratorClass));
      destIndex.addIndexStructureInputStream(
          "direct",
          "org.terrier.structures.DirectIndexInputStream", 
          "org.terrier.structures.Index,java.lang.String,java.lang.Class",
          "index,structureName,"+ 
            (fieldCount > 0 ? fieldDirectIndexPostingIteratorClass : basicDirectIndexPostingIteratorClass));
      
      if (fieldCount > 0)
      {
        destIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.direct.fields.count}");
      }
      else
      {
        destIndex.addIndexStructure("document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", "");
      }
      destIndex.flush();
      
    } catch(IOException ioe) {
      logger.error("IOException while merging df and docid files.", ioe);
    }
  }
  
  protected static Class<?>[] getInterfaces(Object o)
  {
    List<Class<?>> list = new ArrayList<Class<?>>();
    Class<?> c = o.getClass();
    while(! c.equals(Object.class))
    {
      for(Class<?> i : c.getInterfaces())
      {
        list.add(i);
      }
      c = c.getSuperclass();
    }
    return list.toArray(new Class[0]);
  }


  
  /**
   * Merges the two document index files, and the meta files.
   */
  @SuppressWarnings("unchecked")
  protected void mergeDocumentIndexFiles() {
    try {
      //the output docid file
      final DocumentIndexBuilder docidOutput = new DocumentIndexBuilder(destIndex, "document");
      final String[] metaTags = ArrayUtils.parseCommaDelimitedString(srcIndex1.getIndexProperty("index.meta.key-names", "docno"));
      final int[] metaTagLengths = ArrayUtils.parseCommaDelimitedInts(srcIndex1.getIndexProperty("index.meta.value-lengths", "20"));
      final String[] metaReverseTags = MetaReverse
        ? ArrayUtils.parseCommaDelimitedString(srcIndex1.getIndexProperty("index.meta.reverse-key-names", "docno"))
        : new String[0];
      final MetaIndexBuilder metaBuilder = new CompressingMetaIndexBuilder(destIndex, metaTags, metaTagLengths, metaReverseTags);
    
      if (! srcIndex1.getIndexProperty("index.meta.key-names", "docno").equals(srcIndex2.getIndexProperty("index.meta.key-names", "docno")))
      {
        throw new Error("Meta fields in source indices must match");
      }
      
      //opening the first set of files.
      final Iterator<DocumentIndexEntry> docidInput1 = (Iterator<DocumentIndexEntry>)srcIndex1.getIndexStructureInputStream("document");
      final Iterator<String[]> metaInput1 = (Iterator<String[]>)srcIndex1.getIndexStructureInputStream("meta");
      
      int srcFieldCount1 = srcIndex1.getIntIndexProperty("index.inverted.fields.count", 0);
      int srcFieldCount2 = srcIndex2.getIntIndexProperty("index.inverted.fields.count", 0);
      if (srcFieldCount1 != srcFieldCount2)
      {
        throw new Error("FieldCounts in source indices must match");
      }
      if (srcIndex1.getIndexProperty("index.document-factory.class", "").equals("org.terrier.structures.SimpleDocumentIndexEntry$Factory")
        || srcIndex1.getIndexProperty("index.document-factory.class", "").equals("org.terrier.structures.BasicDocumentIndexEntry$Factory"))
      {
        //for some reason, the source document index has not fields. so we shouldn't assume that fields are being used.
        srcFieldCount1 = 0;
      }
      final int fieldCount = srcFieldCount1;
      
      //traversing the first set of files, without any change
      while(docidInput1.hasNext())
      {
        metaInput1.hasNext();
        DocumentIndexEntry die = docidInput1.next();
        DocumentIndexEntry dieNew = (fieldCount > 0) ? die : new SimpleDocumentIndexEntry(die);
        docidOutput.addEntryToBuffer(dieNew);
        metaBuilder.writeDocumentEntry(metaInput1.next());
      }
      
      final Iterator<DocumentIndexEntry> docidInput2 = (Iterator<DocumentIndexEntry>)srcIndex2.getIndexStructureInputStream("document");
      final Iterator<String[]> metaInput2 = (Iterator<String[]>)srcIndex2.getIndexStructureInputStream("meta");
      //traversing the 2nd set of files, without any change
      while(docidInput2.hasNext())
      {
        metaInput2.hasNext();
        DocumentIndexEntry die = docidInput2.next();
        DocumentIndexEntry dieNew = (fieldCount > 0) ? die : new SimpleDocumentIndexEntry(die);
        docidOutput.addEntryToBuffer(dieNew);
        metaBuilder.writeDocumentEntry(metaInput2.next());
      }
      
      docidOutput.finishedCollections();
      docidOutput.close();
      metaBuilder.close();
      IndexUtil.close(docidInput1);
      IndexUtil.close(docidInput2);
      //destIndex.setIndexProperty("index.inverted.fields.count", ""+ fieldCount);
      if (fieldCount > 0)
      {
        destIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
      }
      else
      {
        destIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "", "");
      }
      destIndex.flush();
      
    } catch(IOException ioe) {
      logger.error("IOException while merging docid files.", ioe);
    }
  }


  


  /** 
   * creates the final term code to offset file, and the lexicon hash if enabled.
   */
  protected void createLexidFile() {
    LexiconBuilder.optimise(destIndex, "lexicon");
  }
  
  /**
   * Merges the structures created by terrier.
   */
  public void mergeStructures() {
    final boolean bothInverted = srcIndex1.hasIndexStructure("inverted") && srcIndex2.hasIndexStructure("inverted");
    final boolean bothDirect = srcIndex1.hasIndexStructure("direct") && srcIndex2.hasIndexStructure("direct");
    final boolean bothLexicon = srcIndex1.hasIndexStructure("lexicon") && srcIndex2.hasIndexStructure("lexicon");
    final long t1 = System.currentTimeMillis();
    keepTermCodeMap = bothDirect;
    long t2 = 0;
    long t3 = 0;
    long t4 = 0;
    if (bothInverted)
    {
      mergeInvertedFiles();
      t2 = System.currentTimeMillis();
          //logger.info("merged inverted files in " + ((t2-t1)/1000.0d));
    }
    else if (bothLexicon)
    {
      new LexiconMerger(srcIndex1, srcIndex2, destIndex).mergeLexicons();
      t2 = System.currentTimeMillis();
          //logger.info("merged lexicons in " + ((t2-t1)/1000.0d));
    }
    else
    {
      //logger.warn("No inverted or lexicon - no merging of lexicons took place");
      t2 = System.currentTimeMillis();
    }
    
    if (bothInverted || bothLexicon)
    {
      createLexidFile();
      t3 = System.currentTimeMillis();
      logger.debug("created lexid file and lex hash in " + ((t3-t2)/1000.0d));
    }
    t3 = System.currentTimeMillis();


    if (! bothDirect || ApplicationSetup.getProperty("merge.direct","true").equals("false"))
    {  
      mergeDocumentIndexFiles();
      t4 = System.currentTimeMillis();
      //logger.info("merged documentindex files in " + ((t4-t3)/1000.0d));
    } 
    else 
    {
      mergeDirectFiles();  
      t4 = System.currentTimeMillis();
      //logger.info("merged direct files in " + ((t4-t3)/1000.0d));
    }
  
    if (keepTermCodeMap)
    {
      //save up some memory
      termcodeHashmap.clear();
      termcodeHashmap = null;
    }
  }


  /** Usage: java org.terrier.structures.merging.StructureMerger [binary bits] [inverted file 1] [inverted file 2] [output inverted file] <p>
      * Binary bits concerns the number of fields in use in the index. */
  public static void main(String[] args) throws Exception {
    
    if (args.length != 6)
    {
      logger.fatal("usage: java org.terrier.structures.merging.StructureMerger srcPath1 srcPrefix1 srcPath2 srcPrefix2 destPath1 destPrefix1 ");
      logger.fatal("Exiting ...");
      return;
    }
    
    Index.setIndexLoadingProfileAsRetrieval(false);
    Index indexSrc1 = Index.createIndex(args[0], args[1]);
    Index indexSrc2 = Index.createIndex(args[2], args[3]);
    Index indexDest = Index.createNewIndex(args[4], args[5]);
    
    StructureMerger sMerger = new StructureMerger(indexSrc1, indexSrc2, indexDest);
    long start = System.currentTimeMillis();
    //logger.info("started at " + (new Date()));
    if (ApplicationSetup.getProperty("merger.onlylexicons","false").equals("true")) {
      System.err.println("Use LexiconMerger");
      return;
    } else if (ApplicationSetup.getProperty("merger.onlydocids","false").equals("true")) {
      sMerger.mergeDocumentIndexFiles();
    } else {
      sMerger.mergeStructures();
    }
    indexSrc1.close();
    indexSrc2.close();
    indexDest.close();
    
    //logger.info("finished at " + (new Date()));
    long end = System.currentTimeMillis();
    //logger.info("time elapsed: " + ((end-start)*1.0d/1000.0d) + " sec.");
  }








}
Source Code of org.terrier.structures.merging.StructureMerger

Related Classes of org.terrier.structures.merging.StructureMerger